Effect of UPSTM-Based
Decorrelation on Feature Discovery
Loading the
libraries
library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
library(psych)
library(whitening)
library("vioplot")
library("rpart")
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
Material and
Methods
Data Source https://archive.ics.uci.edu/ml/datasets/seeds
M. Charytanowicz, J. Niewczas, P. Kulczycki, P.A. Kowalski, S.
Lukasik, S. Zak, ‘A Complete Gradient Clustering Algorithm for Features
Analysis of X-ray Images’, in: Information Technologies in Biomedicine,
Ewa Pietka, Jacek Kawa (eds.), Springer-Verlag, Berlin-Heidelberg, 2010,
pp. 15-24.
The Data
seeds <- read.delim("~/GitHub/LatentBiomarkers/Data/seeds_dataset.txt", header=FALSE)
par(cex=0.5)
featnames <- c("area",
"perimeter",
"compactness",
"length_of_kernel",
"width_of_kernel",
"asymmetry_coeff",
"length_ker_groove",
"class"
)
colnames(seeds) <- featnames
#seeds$class <- 1*(seeds$class == 1)
pander::pander(table(seeds$class))
Comparing IDeA vs PCA
vs EFA
IDeA
IDeASeeds <- IDeA(seeds)
plot(IDeASeeds[,colnames(IDeASeeds)!="class"],col=seeds$class,cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

pander::pander(attr(IDeASeeds,"UPSTM"))
| area |
1 |
-0.446 |
0.228 |
-0.360 |
-0.146 |
| perimeter |
0 |
1.000 |
-0.836 |
0.524 |
0.000 |
| length_of_kernel |
0 |
0.000 |
1.000 |
0.000 |
0.000 |
| width_of_kernel |
0 |
0.000 |
0.000 |
1.000 |
0.000 |
| length_ker_groove |
0 |
0.000 |
0.000 |
0.000 |
1.000 |
pander::pander(getLatentCoefficients(IDeASeeds))
La_perimeter:
La_length_of_kernel:
La_width_of_kernel:
La_length_ker_groove:
IDeACor <- cor(IDeASeeds[,colnames(IDeASeeds)!="class"])
gplots::heatmap.2(abs(IDeACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "IDeA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")

pander::pander(colnames(IDeACor))
area, La_perimeter, compactness,
La_length_of_kernel, La_width_of_kernel,
asymmetry_coeff and La_length_ker_groove
diag(IDeACor) <- 0;
getMaxcor <- apply(IDeACor,2,max)
topCorrelated <- getMaxcor[which.max(getMaxcor)]
whotoMax <- getMaxcor[getMaxcor == topCorrelated]
plot(IDeASeeds[,names(whotoMax)],main="IDeA: Top Correlated variables")

plot(IDeASeeds[,c("area","La_perimeter")],main="IDeA: Top Raw Correlated variables")

PCA
featuresnames <- colnames(seeds)[colnames(seeds) != "class"]
pc <- prcomp(seeds[,featuresnames],center = TRUE,tol=0.01) #principal components
PCAseed <- as.data.frame(cbind(predict(pc,seeds[,featuresnames]),class=seeds$class))
plot(PCAseed[,colnames(PCAseed)!="class"],col=seeds$class,cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

pander::pander(pc$rotation)
| area |
-0.88423 |
0.10081 |
0.2645 |
0.1994 |
-0.13717 |
0.28064 |
| perimeter |
-0.39541 |
0.05649 |
-0.2825 |
-0.5788 |
0.57476 |
-0.30156 |
| compactness |
-0.00431 |
-0.00289 |
0.0590 |
0.0578 |
-0.05310 |
-0.04523 |
| length_of_kernel |
-0.12854 |
0.03062 |
-0.4001 |
-0.4361 |
-0.78700 |
-0.11344 |
| width_of_kernel |
-0.11106 |
0.00237 |
0.3192 |
0.2342 |
-0.14480 |
-0.89627 |
| asymmetry_coeff |
0.12762 |
0.98941 |
0.0643 |
-0.0251 |
-0.00158 |
0.00329 |
| length_ker_groove |
-0.12897 |
0.08223 |
-0.7619 |
0.6134 |
0.08765 |
-0.10992 |
PCACor <- cor(PCAseed[,colnames(PCAseed)!="class"])
gplots::heatmap.2(abs(PCACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "PCA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")

EFA
featuresnames <- colnames(seeds)[colnames(seeds) != "class"]
uls <- fa(seeds[,featuresnames],length(featuresnames)-1,rotate="varimax",warnings=FALSE) # EFA analysis
EFAseed <- as.data.frame(cbind(predict(uls,seeds[,featuresnames]),class=seeds$class))
plot(EFAseed[,colnames(EFAseed)!="class"],col=seeds$class,cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

pander::pander(uls$weights)
| area |
0.22124 |
0.552 |
-2.025 |
2.009 |
0.699 |
3.97e-14 |
| perimeter |
0.06513 |
-1.640 |
5.327 |
-2.515 |
3.887 |
-6.30e-14 |
| compactness |
-0.26831 |
0.673 |
1.261 |
-2.093 |
-1.291 |
-9.32e-15 |
| length_of_kernel |
0.32135 |
-0.314 |
0.606 |
0.811 |
-5.108 |
1.22e-14 |
| width_of_kernel |
0.13570 |
1.124 |
-3.346 |
3.283 |
0.705 |
1.84e-14 |
| asymmetry_coeff |
0.00711 |
0.073 |
-0.261 |
0.171 |
-0.113 |
-8.50e-16 |
| length_ker_groove |
0.38754 |
0.338 |
-1.442 |
-2.401 |
0.474 |
-1.16e-15 |
EFACor <- cor(EFAseed[,colnames(EFAseed)!="class"])
gplots::heatmap.2(abs(EFACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "EFA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")

Effect on CAR
modeling
par(op)
par(xpd = TRUE)
seeds$class <- factor(seeds$class)
rawmodel <- rpart(class~.,seeds,control=rpart.control(maxdepth=3))
plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(rawmodel, use.n = TRUE,cex=0.75)

pr <- predict(rawmodel,seeds,type = "class")
pander::pander(table(seeds$class,pr))
pander::pander(c(accuracy=sum(seeds$class==pr)/nrow(seeds)))
IDeASeeds$class <- factor(IDeASeeds$class)
IDeAmodel <- rpart(class~.,IDeASeeds,control=rpart.control(maxdepth=3))
plot(IDeAmodel,main="IDeA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(IDeAmodel, use.n = TRUE,cex=0.75)

pr <- predict(IDeAmodel,IDeASeeds,type = "class")
pander::pander(table(IDeASeeds$class,pr))
pander::pander(c(accuracy=sum(IDeASeeds$class==pr)/nrow(seeds)))
PCAseed$class <- factor(PCAseed$class)
PCAmodel <- rpart(class~.,PCAseed,control=rpart.control(maxdepth=3))
plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(PCAmodel, use.n = TRUE,cex=0.75)

pr <- predict(PCAmodel,PCAseed,type = "class")
pander::pander(table(PCAseed$class,pr))
pander::pander(c(accuracy=sum(PCAseed$class==pr)/nrow(seeds)))
EFAseed$class <- factor(EFAseed$class)
EFAmodel <- rpart(class~.,EFAseed,control=rpart.control(maxdepth=3))
plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(EFAmodel, use.n = TRUE,cex=0.75)

pr <- predict(EFAmodel,EFAseed,type = "class")
pander::pander(table(EFAseed$class,pr))
pander::pander(c(accuracy=sum(EFAseed$class==pr)/nrow(seeds)))
par(op)
Generaring the
report
Libraries
Some libraries
library(psych)
library(whitening)
library("vioplot")
Data specs
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
pander::pander(table(dataframe[,outcome]))
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
largeSet <- length(varlist) > 1000
Scaling the
data
Scaling and removing near zero variance columns and highly
co-linear(r>0.99999) columns
### Some global cleaning
sdiszero <- apply(dataframe,2,sd) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData
The heatmap of the
data
if (!largeSet)
{
hm <- heatMaps(data=dataframe,
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}

Correlation
Matrix of the Data
The heat map of the data
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
#cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Original Correlation",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.9944521
The
decorrelation
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> Included: 7 , Uni p: 0.03711537 , Uncorrelated Base: 3 , Outcome-Driven Size: 0 , Base Size: 3
#>
#>
1 <R=0.994,w= 1,N= 3>, Top: 1( 1 )[ 1 : 1 : 0.972 ]( 1 , 1 , 0 ),<|>Tot Used: 2 , Added: 1 , Zero Std: 0 , Max Cor: 0.972
#>
2 <R=0.972,w= 2,N= 3>, Top: 1( 2 )[ 1 : 1 : 0.936 ]( 1 , 2 , 1 ),<|>Tot Used: 4 , Added: 2 , Zero Std: 0 , Max Cor: 0.862
#>
3 <R=0.862,w= 3,N= 4>, Top: 2( 1 )[ 1 : 2 : 0.831 ]( 2 , 2 , 1 ),<|>Tot Used: 5 , Added: 2 , Zero Std: 0 , Max Cor: 0.809
#>
4 <R=0.809,w= 4,N= 2>, Top: 1( 1 )[ 1 : 1 : 0.800 ]( 1 , 1 , 2 ),<|>Tot Used: 5 , Added: 1 , Zero Std: 0 , Max Cor: 0.715
#>
5 <R=0.000,w= 5,N= 0>
#>
[ 5 ], 0.5987399 Decor Dimension: 5 . Cor to Base: 4 , ABase: 1 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
pander::pander(sum(apply(dataframe[,varlist],2,var)))
4.75
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
2.48
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
5.01
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
4.55
The decorrelation
matrix
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
UPSTM <- attr(DEdataframe,"UPSTM")
gplots::heatmap.2(1.0*(abs(UPSTM)>0),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
}

The correlation
matrix after decorrelation
if (!largeSet)
{
cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Correlation after IDeA",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
par(op)
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.714948
U-MAP Visualization
of features
The UMAP based on
LASSO on Raw Data
classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])

The decorralted
UMAP
datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])

Univariate
Analysis
Univariate
univarRAW <- uniRankVar(varlist,
paste(outcome,"~1"),
outcome,
dataframe,
rankingTest="AUC")
univarDe <- uniRankVar(varlistc,
paste(outcome,"~1"),
outcome,
DEdataframe,
rankingTest="AUC",
)
Final Table
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
| asymmetry_coeff |
-0.5636 |
0.690 |
0.348 |
0.800 |
9.86e-01 |
0.810 |
| length_ker_groove |
-0.2900 |
0.421 |
0.464 |
0.781 |
2.62e-03 |
0.764 |
| compactness |
0.2829 |
0.685 |
-0.273 |
1.041 |
5.53e-01 |
0.653 |
| length_of_kernel |
-0.0626 |
0.430 |
0.257 |
0.919 |
3.80e-04 |
0.562 |
| perimeter |
-0.0320 |
0.341 |
0.192 |
0.888 |
1.91e-05 |
0.524 |
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]
pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
| La_length_ker_groove |
-0.27554 |
0.293 |
0.3092 |
0.266 |
0.851 |
0.929 |
| asymmetry_coeff |
-0.56364 |
0.690 |
0.3484 |
0.800 |
0.986 |
0.810 |
| La_width_of_kernel |
-0.00490 |
0.103 |
-0.1023 |
0.101 |
0.850 |
0.749 |
| compactness |
0.28287 |
0.685 |
-0.2733 |
1.041 |
0.553 |
0.653 |
| La_length_of_kernel |
-0.00498 |
0.108 |
0.0363 |
0.142 |
0.469 |
0.607 |
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))
theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
allSigvars <- names(dc)
dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
coef <- theFormulas[[dx]]
cname <- names(theFormulas[[dx]])
names(cname) <- cname
for (cf in names(coef))
{
if (cf != dx)
{
if (coef[cf]>0)
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
}
else
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("%5.3f*%s",coef[cf],cname[cf]))
}
}
}
}
finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| La_length_ker_groove |
-0.873area + 1.000length_ker_groove |
-0.27554 |
0.293 |
0.3092 |
0.266 |
8.51e-01 |
0.929 |
0.764 |
-1 |
| asymmetry_coeff |
|
-0.56364 |
0.690 |
0.3484 |
0.800 |
9.86e-01 |
0.810 |
0.810 |
NA |
| length_ker_groove |
NA |
-0.28998 |
0.421 |
0.4639 |
0.781 |
2.62e-03 |
0.764 |
0.764 |
NA |
| La_width_of_kernel |
-2.936area + 1.922perimeter +
1.000*width_of_kernel |
-0.00490 |
0.103 |
-0.1023 |
0.101 |
8.50e-01 |
0.749 |
0.501 |
-2 |
| compactness |
|
0.28287 |
0.685 |
-0.2733 |
1.041 |
5.53e-01 |
0.653 |
0.653 |
NA |
| La_length_of_kernel |
+ 1.602area -2.629perimeter +
1.000*length_of_kernel |
-0.00498 |
0.108 |
0.0363 |
0.142 |
4.69e-01 |
0.607 |
0.562 |
-2 |
| length_of_kernel |
NA |
-0.06258 |
0.430 |
0.2567 |
0.919 |
3.80e-04 |
0.562 |
0.562 |
NA |
| perimeter |
NA |
-0.03199 |
0.341 |
0.1918 |
0.888 |
1.91e-05 |
0.524 |
0.524 |
NA |
| area |
NA |
-0.01654 |
0.324 |
0.1772 |
0.896 |
1.24e-05 |
0.515 |
0.515 |
4 |
| width_of_kernel |
NA |
0.00803 |
0.385 |
0.0493 |
0.947 |
2.79e-03 |
0.501 |
0.501 |
NA |